# -*- coding:utf-8 -*-
# @Time:2024/4/1921:01
# @Author:miuzg
# @FileName:new test2.py
# @Software:PyCharm
from lxml import etree

# Sample HTML document that every XPath query below runs against.
html_text = """
<!DOCTYPE html>
<html>
<head>
    <title>Simple HTML Example</title>
</head>
<body>
    <ul>
        <li>Item 1 <span>Additional Info 1</span></li>
        <li>Item 2 <span>Additional Info 2</span></li>
        <li>Item 3 <span>Additional Info 3</span></li>
        <li class="item-5"><a href="https://www.sina.com.cn/" title="新浪">新浪</a><span>新闻网站</span></li>
        <li class="item-6"><a href="https://www.qq.com/" title="腾讯">腾讯</a><span>社交娱乐网站</span></li>
        <li class="item-7"><a href="https://www.zhihu.com/" title="知乎">知乎</a><span>知识问答网站</span></li>
    </ul>
</body>
</html>
"""

# Line printed between sections to keep the output readable.
SEPARATOR = '*' * 50

# Hand the string to etree and let it parse the data as HTML.
# The raw text keeps its own name so `html` always refers to the parsed tree.
html = etree.HTML(html_text)

# Select every <li> element (prints the list of element objects).
li = html.xpath('//li')
print(li)
print(SEPARATOR)

# Serialize each <li> back to markup.
for item in li:
    # encoding='unicode' returns a str directly; the default serialization is
    # ASCII, which would print the Chinese text as numeric character
    # references (e.g. &#26032;) instead of readable characters.
    print(etree.tostring(item, encoding='unicode'))
print(SEPARATOR)

# Collect the class attribute of every <li> that has one.
class_data = html.xpath('//li/@class')
print(class_data)
print(SEPARATOR)

# Text of the <span> children of each <li>.
span = html.xpath('//li/span/text()')
print(span)
print(SEPARATOR)

# Text of the <a> children of each <li>.
a = html.xpath('//li/a/text()')
print(a)
print(SEPARATOR)

# href attribute of the <a> children of each <li>.
href = html.xpath('//li/a/@href')
print(href)
print(SEPARATOR)

# Text of the link whose href is the qq.com URL.
# xpath() returns a list, so guard against an empty result before indexing.
qq_matches = html.xpath('//li/a[@href="https://www.qq.com/"]/text()')
qq_text = qq_matches[0] if qq_matches else None
print(qq_text)
print(SEPARATOR)

# All text inside the last <li> (//text() includes text of descendants).
last_li = html.xpath('//li[last()]//text()')
print(last_li)
print(SEPARATOR)

# Text of the second <li>; XPath positions start at 1.
# /text() selects only the <li>'s own direct text nodes, so the text inside
# its child <span> is not included here.
sec_li = html.xpath('//li[2]/text()')
print(sec_li)